{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "639f468f-4196-4dbb-b92c-9125d89251ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "from linearmodels import PanelOLS\n",
    "from linearmodels.panel import PooledOLS\n",
    "import statsmodels.api as sm\n",
    "\n",
    "# Directory prefix that is string-concatenated with each parquet file name when data is loaded,\n",
    "# so the trailing slash is required.\n",
    "# NOTE(review): the leading '...' looks like a placeholder for a local path -- confirm before running.\n",
    "cd_data = '.../Data/Intraday/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "682fc616-a12f-49be-b39b-d04aec01921d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load main datasets\n",
    "# Firm sentiment\n",
    "df_firm = pd.read_parquet(cd_data + 'Congress_tweets.parquet')\n",
    "df_firm['Date'] = pd.to_datetime(df_firm['Date'])\n",
    "\n",
    "# Firm stock returns around tweets\n",
    "df_returns = pd.read_parquet(cd_data + 'Returns_around_tweets_1m_5m.parquet')\n",
    "\n",
    "# Log price change between the 'before' and 'after' prices, scaled by 100*100 (basis points)\n",
    "for ret_col, px_after, px_before in [\n",
    "    ('stock_return', 'price_after', 'price_before'),\n",
    "    ('stock_return_SPY', 'price_spy_after', 'price_spy_before'),\n",
    "]:\n",
    "    df_returns[ret_col] = (np.log(df_returns[px_after]) - np.log(df_returns[px_before]))*100*100\n",
    "\n",
    "# Excess return: stock return net of the SPY return over the same prices' window\n",
    "df_returns['stock_return_excess'] = df_returns['stock_return'] - df_returns['stock_return_SPY']\n",
    "\n",
    "# Use the same timestamp column name as df_firm, then parse every timestamp column\n",
    "df_returns = df_returns.rename(columns={'Date_tuit': 'Date'})\n",
    "for ts_col in ['Date', 'Date_before', 'Date_after']:\n",
    "    df_returns[ts_col] = pd.to_datetime(df_returns[ts_col])\n",
    "\n",
    "# Left join: every row of df_firm is kept; return columns are missing where nothing matches\n",
    "df_firm_returns = pd.merge(df_firm, df_returns, how='left', on=['Date', 'ID', 'permno'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e33c58a-9d23-4100-91a8-892a29e4f4f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plot figure\n",
    "# Average tone and returns within tone quantile bins (10 requested; duplicate bin edges are dropped)\n",
    "df_plot = df_firm_returns.copy()\n",
    "df_plot['quantile_ex_3'] = pd.qcut(df_plot['tone'], 10, duplicates='drop')\n",
    "bin_means = df_plot.groupby(['quantile_ex_3'])[['tone','stock_return_excess','stock_return']].mean()\n",
    "df_plot = bin_means.reset_index().reset_index()\n",
    "\n",
    "# One panel per return measure: (column to plot, y-axis label)\n",
    "panels = [\n",
    "    ('stock_return', 'Stock returns in basis points'),\n",
    "    ('stock_return_excess', 'Abnormal stock returns in basis points'),\n",
    "]\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(20,10), sharex=True, sharey=False)\n",
    "for ax, (ret_col, y_label) in zip(axes.flatten(), panels):\n",
    "    # Make this panel the current axes so the pyplot-level calls below target it\n",
    "    plt.sca(ax)\n",
    "\n",
    "    x = df_plot['tone'].values\n",
    "    y = df_plot[ret_col].values\n",
    "\n",
    "    # Bin averages plus a least-squares straight line through them\n",
    "    ax.scatter(x, y, s=100, color='darkblue')\n",
    "    m, b = np.polyfit(x, y, 1)\n",
    "    ax.plot(x, m*x+b, color='r', linewidth=3)\n",
    "\n",
    "    ax.set_ylabel(y_label, fontsize='25')\n",
    "    ax.set_xlabel('Standardized tone measure', fontsize='25')\n",
    "    ax.set_ylim([-8,8])\n",
    "\n",
    "    for side in ('top', 'right'):\n",
    "        ax.spines[side].set_visible(False)\n",
    "    for axis in (ax.xaxis, ax.yaxis):\n",
    "        axis.set_tick_params(labelsize=25)\n",
    "    plt.tight_layout()\n",
    "\n",
    "plt.subplots_adjust(wspace=0.15, hspace=0.0)\n",
    "plt.margins(x=0, y=0)\n",
    "plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07e56f06-9f17-447a-8fe6-363b70e2ff47",
   "metadata": {},
   "outputs": [],
   "source": [
    "#  Firm-level controls\n",
    "data_reg_hf_controls = df_firm_returns.copy()\n",
    "time_window = 10 #  Benchmark is 10\n",
    "\n",
    "# Control window: [Date_before, Date_after] widened by time_window minutes on each side\n",
    "window_pad = pd.Timedelta(minutes=time_window)\n",
    "data_reg_hf_controls['Date_before_controls'] = data_reg_hf_controls['Date_before'] - window_pad\n",
    "data_reg_hf_controls['Date_after_controls'] = data_reg_hf_controls['Date_after'] + window_pad\n",
    "\n",
    "def lookup_sentiment_firm_news(row, input_df, sentiment_select):\n",
    "    \"\"\"Return the first `sentiment_select` value in `input_df` for the row's firm and control window.\n",
    "\n",
    "    Rows of `input_df` are kept when their 'SYM_ROOT' equals row['SYM_ROOT'] and their 'date'\n",
    "    lies in [row['Date_before_controls'], row['Date_after_controls']], both ends inclusive.\n",
    "    The first surviving row, in `input_df` order, supplies the value; np.nan is returned\n",
    "    when no row survives.\n",
    "    \"\"\"\n",
    "    same_firm = input_df[input_df['SYM_ROOT'] == row['SYM_ROOT']]\n",
    "    starts_ok = same_firm['date'] >= row['Date_before_controls']\n",
    "    ends_ok = same_firm['date'] <= row['Date_after_controls']\n",
    "    hits = same_firm[starts_ok & ends_ok]\n",
    "    if hits.empty:\n",
    "        return np.nan\n",
    "    return hits.iloc[0][sentiment_select]\n",
    "    \n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#   News articles concerning the target companies\n",
    "df_news = pd.read_parquet(cd_data + 'News_headlines.parquet').drop(columns=['title'])\n",
    "df_news['date'] = pd.to_datetime(df_news['date'])\n",
    "news_sentiment = data_reg_hf_controls.apply(\n",
    "    lookup_sentiment_firm_news, axis=1, input_df=df_news, sentiment_select='Sentiment_Dictionary_news'\n",
    ")\n",
    "# Missing values (e.g. no article found in the window) are set to 0\n",
    "data_reg_hf_controls['Sentiment_Dictionary_news'] = news_sentiment.fillna(0)\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Tweets from major news media outlets\n",
    "df_NewsTweets = pd.read_parquet(cd_data + 'News_tweets.parquet')\n",
    "df_NewsTweets['date'] = pd.to_datetime(df_NewsTweets['date'])\n",
    "news_tweet_sentiment = data_reg_hf_controls.apply(\n",
    "    lookup_sentiment_firm_news, axis=1, input_df=df_NewsTweets, sentiment_select='Sentiment_Dictionary_newsTweets'\n",
    ")\n",
    "# Missing values (e.g. no tweet found in the window) are set to 0\n",
    "data_reg_hf_controls['Sentiment_Dictionary_newsTweets'] = news_tweet_sentiment.fillna(0)\n",
    "\n",
    "def lookup_firm_announcements(row, input_df, sentiment_select):\n",
    "    \"\"\"Return the first `sentiment_select` value among the row's firm announcements in its control window.\n",
    "\n",
    "    Firm match is on 'SYM_ROOT'; the window is row['Date_before_controls'] through\n",
    "    row['Date_after_controls'], inclusive, compared against `input_df['date']`.\n",
    "    Returns np.nan when no announcement matches. The first match follows `input_df` row order.\n",
    "    \"\"\"\n",
    "    firm_rows = input_df.loc[input_df['SYM_ROOT'].eq(row['SYM_ROOT'])]\n",
    "    lo, hi = row['Date_before_controls'], row['Date_after_controls']\n",
    "    window_rows = firm_rows.loc[firm_rows['date'].ge(lo) & firm_rows['date'].le(hi)]\n",
    "    return np.nan if window_rows.empty else window_rows.iloc[0][sentiment_select]\n",
    "    \n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Analysts Forecast errors Revenue and EPS\n",
    "df_ForecastErrors_SAL = pd.read_parquet(cd_data + 'ForecastErrors_SAL.parquet')\n",
    "df_ForecastErrors_SAL['date'] = pd.to_datetime(df_ForecastErrors_SAL['date'])\n",
    "fe_sal = data_reg_hf_controls.apply(\n",
    "    lookup_firm_announcements, axis=1, input_df=df_ForecastErrors_SAL, sentiment_select='FE_SAL'\n",
    ")\n",
    "# Missing values (e.g. no announcement found in the window) are set to 0\n",
    "data_reg_hf_controls['Announcements_FE_SAL'] = fe_sal.fillna(0)\n",
    "\n",
    "df_ForecastErrors_EPS = pd.read_parquet(cd_data + 'ForecastErrors_EPS.parquet')\n",
    "df_ForecastErrors_EPS['date'] = pd.to_datetime(df_ForecastErrors_EPS['date'])\n",
    "fe_eps = data_reg_hf_controls.apply(\n",
    "    lookup_firm_announcements, axis=1, input_df=df_ForecastErrors_EPS, sentiment_select='FE_EPS'\n",
    ")\n",
    "data_reg_hf_controls['Announcements_FE_EPS'] = fe_eps.fillna(0)\n",
    "       \n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Management Guidance errors Revenue and EPS\n",
    "# The 'date' column is parsed explicitly so that the window comparison inside\n",
    "# lookup_firm_announcements is datetime-vs-datetime however the parquet stored it\n",
    "# (pd.to_datetime leaves an already-datetime column unchanged).\n",
    "df_ForecastErrors_FIRM_SAL = pd.read_parquet(cd_data + 'ForecastErrors_FIRM_SAL.parquet')\n",
    "df_ForecastErrors_FIRM_SAL['date'] = pd.to_datetime(df_ForecastErrors_FIRM_SAL['date'])\n",
    "data_reg_hf_controls['Announcements_FE_FIRM_SAL'] = data_reg_hf_controls.apply(lookup_firm_announcements, input_df = df_ForecastErrors_FIRM_SAL,sentiment_select='FE_FIRM_SAL', axis=1)\n",
    "# Missing values (e.g. no guidance found in the window) are set to 0\n",
    "data_reg_hf_controls['Announcements_FE_FIRM_SAL'] = data_reg_hf_controls['Announcements_FE_FIRM_SAL'].fillna(0)\n",
    "\n",
    "df_ForecastErrors_FIRM_EPS = pd.read_parquet(cd_data + 'ForecastErrors_FIRM_EPS.parquet')\n",
    "df_ForecastErrors_FIRM_EPS['date'] = pd.to_datetime(df_ForecastErrors_FIRM_EPS['date'])\n",
    "data_reg_hf_controls['Announcements_FE_FIRM_EPS'] = data_reg_hf_controls.apply(lookup_firm_announcements, input_df = df_ForecastErrors_FIRM_EPS,sentiment_select='FE_FIRM_EPS', axis=1)\n",
    "data_reg_hf_controls['Announcements_FE_FIRM_EPS'] = data_reg_hf_controls['Announcements_FE_FIRM_EPS'].fillna(0)\n",
    "\n",
    "def lookup_firm_announcements_fr(row, input_df, sentiment_select):\n",
    "    \"\"\"Return the first `sentiment_select` value for the row's 'permno' inside its control window.\n",
    "\n",
    "    Same lookup as the 'SYM_ROOT'-based helpers, except the firm is matched on 'permno'.\n",
    "    The window is [row['Date_before_controls'], row['Date_after_controls']], inclusive,\n",
    "    compared against `input_df['date']`. Returns np.nan when nothing matches.\n",
    "    \"\"\"\n",
    "    lower, upper = row['Date_before_controls'], row['Date_after_controls']\n",
    "    candidates = input_df[input_df['permno'] == row['permno']]\n",
    "    candidates = candidates[(candidates['date'] >= lower) & (candidates['date'] <= upper)]\n",
    "    if len(candidates) == 0:\n",
    "        return np.nan\n",
    "    return candidates.iloc[0][sentiment_select]\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Forecast Revision Revenue and EPS\n",
    "# 'anndats_analyst' is renamed to 'date' because lookup_firm_announcements_fr filters on a 'date' column.\n",
    "# It is then parsed explicitly so the window comparison is datetime-vs-datetime however the parquet\n",
    "# stored it (pd.to_datetime leaves an already-datetime column unchanged).\n",
    "df_ForecastRevision_SAL = pd.read_parquet(cd_data + 'ForecastRevision_SAL.parquet')\n",
    "df_ForecastRevision_SAL = df_ForecastRevision_SAL.rename(columns = {'anndats_analyst':'date','ticker':'SYM_ROOT'})\n",
    "df_ForecastRevision_SAL['date'] = pd.to_datetime(df_ForecastRevision_SAL['date'])\n",
    "data_reg_hf_controls['Announcements_FR_SAL'] = data_reg_hf_controls.apply(lookup_firm_announcements_fr, input_df = df_ForecastRevision_SAL,sentiment_select='FR_SAL', axis=1)\n",
    "# Missing values (e.g. no revision found in the window) are set to 0\n",
    "data_reg_hf_controls['Announcements_FR_SAL'] = data_reg_hf_controls['Announcements_FR_SAL'].fillna(0)\n",
    "\n",
    "df_ForecastRevision_EPS = pd.read_parquet(cd_data + 'ForecastRevision_EPS.parquet')\n",
    "df_ForecastRevision_EPS = df_ForecastRevision_EPS.rename(columns = {'anndats_analyst':'date','ticker':'SYM_ROOT'})\n",
    "df_ForecastRevision_EPS['date'] = pd.to_datetime(df_ForecastRevision_EPS['date'])\n",
    "data_reg_hf_controls['Announcements_FR_EPS'] = data_reg_hf_controls.apply(lookup_firm_announcements_fr, input_df = df_ForecastRevision_EPS,sentiment_select='FR_EPS', axis=1)\n",
    "data_reg_hf_controls['Announcements_FR_EPS'] = data_reg_hf_controls['Announcements_FR_EPS'].fillna(0)\n",
    "\n",
    "\n",
    "def lookup_macro_announcements(row, input_df):\n",
    "    \"\"\"Return 'surprise_S' from the first row of `input_df` dated inside the row's control window.\n",
    "\n",
    "    The window is [row['Date_before_controls'], row['Date_after_controls']], inclusive, compared\n",
    "    against `input_df['date']`; there is no firm filter. Returns np.nan when no row falls inside.\n",
    "    \"\"\"\n",
    "    after_start = input_df['date'] >= row['Date_before_controls']\n",
    "    before_end = input_df['date'] <= row['Date_after_controls']\n",
    "    releases = input_df[after_start & before_end]\n",
    "    if releases.empty:\n",
    "        return np.nan\n",
    "    return releases.iloc[0]['surprise_S']\n",
    "    \n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Macroeconomic announcements\n",
    "df_macro = pd.read_parquet(cd_data + 'macro_controls.parquet')\n",
    "df_macro['date'] = pd.to_datetime(df_macro['date'])\n",
    "macro_surprise = data_reg_hf_controls.apply(lookup_macro_announcements, axis=1, input_df=df_macro)\n",
    "# Missing values (e.g. no announcement found in the window) are set to 0\n",
    "data_reg_hf_controls['Announcements_macro'] = macro_surprise.fillna(0)\n",
    "    \n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "# Bloomberg sentiment measure - daily frequency\n",
    "# Temporary calendar-day key derived from the 'Date' timestamp; dropped again after the merge\n",
    "data_reg_hf_controls['date_day'] = pd.to_datetime(data_reg_hf_controls['Date'].dt.date)\n",
    "\n",
    "df_Bloomberg_daily = pd.read_parquet(cd_data + 'BloombergSentiment.parquet')\n",
    "df_Bloomberg_daily['date_day'] = pd.to_datetime(df_Bloomberg_daily['date'])\n",
    "bloomberg_cols = df_Bloomberg_daily[['date_day','permno','BloombergSentiment']]\n",
    "\n",
    "# Left join on (day, firm), then replace missing sentiment with 0\n",
    "data_reg_hf_controls = (\n",
    "    data_reg_hf_controls\n",
    "    .merge(bloomberg_cols, how='left', on=['date_day','permno'])\n",
    "    .drop(columns=['date_day'])\n",
    ")\n",
    "data_reg_hf_controls['BloombergSentiment'] = data_reg_hf_controls['BloombergSentiment'].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "731c290c-fe47-4985-b5b3-957b8afa153d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table 1\n",
    "# Rows of the results table. The R-squared label is a raw string so the backslash in '\\%' is kept\n",
    "# literally without relying on an invalid escape sequence.\n",
    "exog_var_results = ['a','a_t','b','b_t', r'$R$-squared (\\%)','Observations','Firm-level controls','Macro-level controls']\n",
    "all_regs = [1, 2, 3, 4]\n",
    "coef_results = pd.DataFrame(columns = all_regs, index = exog_var_results )\n",
    "df_regression = data_reg_hf_controls.copy()\n",
    "\n",
    "# Panel index: entity = permno, time = Date\n",
    "index = ['permno', 'Date']\n",
    "\n",
    "firm_controls = ['cum_return','ILLIQ','Sentiment_Dictionary_news', 'Sentiment_Dictionary_newsTweets',\n",
    "                 'Announcements_FE_SAL', 'Announcements_FE_EPS', 'Announcements_FE_FIRM_SAL', 'Announcements_FE_FIRM_EPS',\n",
    "                 'Announcements_FR_SAL', 'Announcements_FR_EPS', 'BloombergSentiment']\n",
    "macro_controls = ['Announcements_macro']\n",
    "\n",
    "\n",
    "def fit_table1_column(data, return_select, exog_vars, index_cols):\n",
    "    \"\"\"Fit one Table 1 specification and return the fitted linearmodels results object.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : DataFrame holding the dependent variable, the regressors and the index columns.\n",
    "    return_select : name of the dependent-variable column.\n",
    "    exog_vars : list of regressor column names (a constant is added here).\n",
    "    index_cols : [entity, time] column names used as the panel index.\n",
    "\n",
    "    Rows with a missing value in any used column are dropped. The model is PanelOLS with\n",
    "    entity effects and a covariance clustered by both entity and time.\n",
    "    \"\"\"\n",
    "    df_reg = data[[return_select] + exog_vars + index_cols].dropna().set_index(index_cols)\n",
    "    y = df_reg[[return_select]]\n",
    "    x = sm.add_constant(df_reg[exog_vars])\n",
    "    mod = PanelOLS(y, x, entity_effects=True)\n",
    "    return mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)\n",
    "\n",
    "\n",
    "# (table column, dependent variable, regressors, firm-level controls?, macro-level controls?)\n",
    "table1_specs = [\n",
    "    # Column~1. No controls\n",
    "    (1, 'stock_return', ['tone'], 'No', 'No'),\n",
    "    # Column~2. Firm-level controls\n",
    "    (2, 'stock_return', ['tone'] + firm_controls, 'Yes', 'No'),\n",
    "    # Column~3. Firm-level controls and Macro-level controls\n",
    "    (3, 'stock_return', ['tone'] + firm_controls + macro_controls, 'Yes', 'Yes'),\n",
    "    # Column~4. Firm-level controls and Macro-level controls and abnormal returns\n",
    "    (4, 'stock_return_excess', ['tone'] + firm_controls + macro_controls, 'Yes', 'Yes'),\n",
    "]\n",
    "\n",
    "for index_reg, return_select, exog_vars_1, has_firm_controls, has_macro_controls in table1_specs:\n",
    "    results = fit_table1_column(df_regression, return_select, exog_vars_1, index)\n",
    "    params = results.params\n",
    "    tvalues = results.tstats\n",
    "    r2 = results.rsquared*100\n",
    "    nobs = results.nobs\n",
    "\n",
    "    # Single .loc[row, column] writes: chained coef_results[col].loc[row] = ... is not guaranteed to\n",
    "    # modify coef_results. Coefficients are read by label ('const' is the name sm.add_constant gives\n",
    "    # the intercept) rather than by position.\n",
    "    coef_results.loc['a', index_reg] = params['const']\n",
    "    coef_results.loc['a_t', index_reg] = tvalues['const']\n",
    "    coef_results.loc['b', index_reg] = params['tone']\n",
    "    coef_results.loc['b_t', index_reg] = tvalues['tone']\n",
    "    coef_results.loc[r'$R$-squared (\\%)', index_reg] = r2\n",
    "    coef_results.loc['Observations', index_reg] = nobs\n",
    "    coef_results.loc['Firm-level controls', index_reg] = has_firm_controls\n",
    "    coef_results.loc['Macro-level controls', index_reg] = has_macro_controls"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
